library(Seurat)
library(Matrix)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(tibble)
library(cowplot)
library(patchwork)
library(stringr)
library(pheatmap)
library(reshape2)
library(readxl)
library(stringr)
library(DESeq2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(tidyverse)
library(viridisLite)
theme_set(theme_classic())
data_dir = "/home/zli4/MSK_DEC24/Analysis"
out_dir = "/home/zli4/MSK_DEC24/Analysis/outs/differential_expression"
tp = "D-1"
print(tp)## [1] "D-1"
f_pseudobulk_list = file.path(out_dir,"pseudobulk_list.RDS")
pseudo_obj_list = readRDS(f_pseudobulk_list)min_cells = 30
pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
pseudo_obj%>%subset(subset = n>=min_cells)
})geno2test_tp = readRDS( file.path(out_dir,"geno2test_tp.RDS"))geno2test = geno2test_tp[[tp]]
pseudo_obj = pseudo_obj_list[[tp]]res_all = NULL
for(ct in unique(geno2test$celltype)){
print("==========")
print(paste0("working on ", ct))
print("==========")
geno2use = geno2test%>%filter(celltype==ct)%>%pull(genotype)
geno2use = as.character(geno2use)
stopifnot(length(geno2use) > 0)
dat = pseudo_obj%>%subset(celltype ==ct)%>%subset(genotype%in%(union(geno2use,"WT")))
dat_mat = GetAssayData(dat, layer = 'counts')
dat_mat <- dat_mat[!grepl("^ENSG", rownames(dat_mat)), ] # remove genes without gene symbols
n_non_zero = rowSums(dat_mat > 0)
dat_mat = dat_mat[n_non_zero >= ncol(dat_mat) * 0.5, ]
meta_df = dat[[]]
if(!"WT"%in%unique(meta_df$genotype)){
next
}
meta_df$genotype = factor(meta_df$genotype)
meta_df$genotype = relevel(meta_df$genotype, ref="WT")
meta_df$log10_rd = log10(colSums(dat_mat))
summary(meta_df$log10_rd)
if(nrow(meta_df)<=5){# check number of samples
next
}
# create dds object
if(length(unique(meta_df$source))>1){ # pseudobulk samples come from two sources (both external_WT and KO_village)
if(all(rowSums(table(meta_df$genotype,meta_df$source)!=0)==1)){
# when all WT samples come from one data source while KO samples the other
next
}
if(length(unique(meta_df$background))==1){
print("===== all pseudobulk samples come from one background ===== ")
# all pseudobulk samples come from one background
dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df,
~ genotype + log10_rd + source)
}else{
dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df,
~ genotype + log10_rd + background + source)
}
}else{ # all pseudobulk samples come from one source
print("===== all pseudobulk samples come from one source===== ")
if(length(unique(meta_df$background))==1){
print("===== all pseudobulk samples come from one background ===== ")
dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df,
~ genotype + log10_rd )
}else{
dds = DESeqDataSetFromMatrix(dat_mat, colData=meta_df,
~ genotype + log10_rd + background)
}
}
vsd = vst(dds, blind=FALSE)
# run PCA
pcs = plotPCA(vsd, intgroup = c("genotype", "background", "source"), returnData = TRUE)
genotypes2plot = setdiff(pcs$genotype, "WT")
# PCA plot: color by genotype, shape by cell background
plot_lists = list()
for (geno1 in genotypes2plot) {
pcs$highlight <- ifelse(
pcs$genotype == "WT", "WT",
ifelse(pcs$genotype == geno1, "KO", "Other")
)
p1 = ggplot(pcs, aes(x = PC1, y = PC2, color = highlight,
alpha = highlight, shape = background)) +
geom_point(size = 2) +
scale_color_manual(values = c("WT" = "blue", "KO" = "red",
"Other" = "grey")) +
scale_alpha_manual(values = c("WT" = 1, "KO" = 1, "Other" = 0.5)) +
labs(title = paste(ct, geno1)) +
guides(
color = "none",
alpha = "none",
shape = guide_legend(title = "")
)
plot_lists[[geno1]] = p1
}
i = 1
while (i <= length(plot_lists)) {
if (i < length(plot_lists)) {
grid.arrange(plot_lists[[i]], plot_lists[[i + 1]], ncol = 2)
} else {
grid.arrange(plot_lists[[i]], ncol = 2)
}
i <- i + 2
Sys.sleep(1)
}
# PCA plot: color by genotype, shape by source
plot_lists = list()
for (geno1 in genotypes2plot) {
pcs$highlight <- ifelse(
pcs$genotype == "WT", "WT",
ifelse(pcs$genotype == geno1, "KO", "Other")
)
p1 = ggplot(pcs, aes(x = PC1, y = PC2, color = highlight,
alpha = highlight, shape = source)) +
geom_point(size = 2) +
scale_color_manual(values = c("WT" = "blue", "KO" = "red",
"Other" = "grey")) +
scale_alpha_manual(values = c("WT" = 1, "KO" = 1, "Other" = 0.5)) +
labs(title = paste(ct, geno1)) +
guides(
color = "none",
alpha = "none",
shape = guide_legend(title = "")
)
plot_lists[[geno1]] = p1
}
i = 1
while (i <= length(plot_lists)) {
if (i < length(plot_lists)) {
grid.arrange(plot_lists[[i]], plot_lists[[i + 1]], ncol = 2)
} else {
grid.arrange(plot_lists[[i]], ncol = 2)
}
i <- i + 2
Sys.sleep(1)
}
dds = DESeq(dds)
# check cell background effect
if(length(unique(meta_df$background))>1){
res_k = results(dds, contrast = c("background", "HUES8", "H1"))
res_k = as.data.frame(res_k)
g1 = ggplot(res_k, aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("genetic background") +
guides(fill=guide_legend(ncol=2))
w_de = which(res_k$padj < 0.05)
res_k$delabel = NA
res_k$delabel[w_de] = rownames(res_k)[w_de]
res_k$diffexpressed = "No"
res_k$diffexpressed[res_k$log2FoldChange > 0 & res_k$padj < 0.05] <- "Up"
res_k$diffexpressed[res_k$log2FoldChange < 0 & res_k$padj < 0.05] <- "Down"
res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
col2use = c("blue", "grey", "red")
names(col2use) = c("Down", "No", "Up")
g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj),
col=diffexpressed, label=delabel)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
scale_color_manual(values=col2use)
print(ggarrange(g1, g2, widths = c(2,3)))
saveRDS(res_k,
file=file.path(out_dir, paste0(ct,"_DE_pseudobulk_background_effect.RDS",sep = "")))
}
if(length(unique(meta_df$source))>1){
# check data source effect
res_k = results(dds, contrast = c("source", "external_WT", "KO_village" ))
res_k = as.data.frame(res_k)
g1 = ggplot(res_k, aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("data source") +
guides(fill=guide_legend(ncol=2))
w_de = which(res_k$padj < 0.05)
res_k$delabel = NA
res_k$delabel[w_de] = rownames(res_k)[w_de]
res_k$diffexpressed = "No"
res_k$diffexpressed[res_k$log2FoldChange > 0 & res_k$padj < 0.05] <- "Up"
res_k$diffexpressed[res_k$log2FoldChange < 0 & res_k$padj < 0.05] <- "Down"
res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
col2use = c("blue", "grey", "red")
names(col2use) = c("Down", "No", "Up")
g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj),
col=diffexpressed, label=delabel)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
scale_color_manual(values=col2use)
print(ggarrange(g1, g2, widths = c(2,3)))
saveRDS(res_k,
file=file.path(out_dir, paste0(ct,"_DE_pseudobulk_source_effect.RDS",sep = "")))
}
# KO vs WT DE analysis conditioned on the given cell type
for(k in 1:length(geno2use)){
nm_k = paste(ct, geno2use[k])
res_k = results(dds, contrast = c("genotype", geno2use[k], "WT"))
res_k = as.data.frame(res_k)
table(is.na(res_k$pvalue))
g1 = ggplot(res_k, aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(nm_k) +
guides(fill=guide_legend(ncol=2))
w_de = which(res_k$padj < 0.05)
res_k$delabel = NA
res_k$delabel[w_de] = rownames(res_k)[w_de]
res_k$diffexpressed = "No"
res_k$diffexpressed[res_k$log2FoldChange > 0 & res_k$padj < 0.05] <- "Up"
res_k$diffexpressed[res_k$log2FoldChange < 0 & res_k$padj < 0.05] <- "Down"
res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
col2use = c("blue", "grey", "red")
names(col2use) = c("Down", "No", "Up")
g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj),
col=diffexpressed, label=delabel)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
scale_color_manual(values=col2use) +
ggtitle(nm_k)
print(ggarrange(g1, g2, widths = c(2,3)))
res_k$celltype = ct
res_k$genotype = geno2use[k]
res_k$gene = rownames(res_k)
rownames(res_k) = NULL
res_all = rbind(res_all, res_k)
}
}## [1] "=========="
## [1] "working on ESC"
## [1] "=========="
## [1] "===== all pseudobulk samples come from one source===== "
saveRDS(res_all,
file=file.path(out_dir,paste0( "DE_pseudobulk_",tp,".RDS")))gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 8219604 439 14714865 785.9 14714865 785.9
## Vcells 101842129 777 251290144 1917.2 314112680 2396.5
sessionInfo()## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Los_Angeles
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] viridisLite_0.4.2 lubridate_1.9.3
## [3] forcats_1.0.0 readr_2.1.5
## [5] tidyverse_2.0.0 ggpubr_0.6.0
## [7] ggrepel_0.9.5 gridExtra_2.3
## [9] DESeq2_1.44.0 SummarizedExperiment_1.34.0
## [11] Biobase_2.64.0 MatrixGenerics_1.16.0
## [13] matrixStats_1.3.0 GenomicRanges_1.56.0
## [15] GenomeInfoDb_1.40.0 IRanges_2.38.0
## [17] S4Vectors_0.42.0 BiocGenerics_0.50.0
## [19] readxl_1.4.3 reshape2_1.4.4
## [21] pheatmap_1.0.12 stringr_1.5.1
## [23] patchwork_1.2.0 cowplot_1.1.3
## [25] tibble_3.2.1 ggplot2_3.5.1
## [27] purrr_1.0.2 tidyr_1.3.1
## [29] dplyr_1.1.4 Matrix_1.7-0
## [31] Seurat_5.1.0 SeuratObject_5.0.2
## [33] sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.0 later_1.3.2
## [4] cellranger_1.1.0 polyclip_1.10-6 fastDummies_1.7.3
## [7] lifecycle_1.0.4 rstatix_0.7.2 globals_0.16.3
## [10] lattice_0.22-6 MASS_7.3-60.2 backports_1.4.1
## [13] magrittr_2.0.3 plotly_4.10.4 sass_0.4.9
## [16] rmarkdown_2.26 jquerylib_0.1.4 yaml_2.3.8
## [19] httpuv_1.6.15 sctransform_0.4.1 spam_2.10-0
## [22] spatstat.sparse_3.0-3 reticulate_1.36.1 pbapply_1.7-2
## [25] RColorBrewer_1.1-3 abind_1.4-5 zlibbioc_1.50.0
## [28] Rtsne_0.17 GenomeInfoDbData_1.2.12 irlba_2.3.5.1
## [31] listenv_0.9.1 spatstat.utils_3.0-4 goftest_1.2-3
## [34] RSpectra_0.16-1 spatstat.random_3.2-3 fitdistrplus_1.2-1
## [37] parallelly_1.37.1 leiden_0.4.3.1 codetools_0.2-20
## [40] DelayedArray_0.30.1 tidyselect_1.2.1 UCSC.utils_1.0.0
## [43] farver_2.1.2 spatstat.explore_3.2-7 jsonlite_1.8.8
## [46] progressr_0.14.0 ggridges_0.5.6 survival_3.6-4
## [49] tools_4.4.0 ica_1.0-3 Rcpp_1.0.12
## [52] glue_1.7.0 SparseArray_1.4.3 xfun_0.44
## [55] withr_3.0.0 fastmap_1.2.0 fansi_1.0.6
## [58] digest_0.6.35 timechange_0.3.0 R6_2.5.1
## [61] mime_0.12 colorspace_2.1-0 scattermore_1.2
## [64] tensor_1.5 spatstat.data_3.0-4 utf8_1.2.4
## [67] generics_0.1.3 data.table_1.15.4 httr_1.4.7
## [70] htmlwidgets_1.6.4 S4Arrays_1.4.0 uwot_0.2.2
## [73] pkgconfig_2.0.3 gtable_0.3.5 lmtest_0.9-40
## [76] XVector_0.44.0 htmltools_0.5.8.1 carData_3.0-5
## [79] dotCall64_1.1-1 scales_1.3.0 png_0.1-8
## [82] knitr_1.46 rstudioapi_0.16.0 tzdb_0.4.0
## [85] nlme_3.1-164 cachem_1.0.8 zoo_1.8-12
## [88] KernSmooth_2.23-22 parallel_4.4.0 miniUI_0.1.1.1
## [91] pillar_1.9.0 grid_4.4.0 vctrs_0.6.5
## [94] RANN_2.6.1 promises_1.3.0 car_3.1-2
## [97] xtable_1.8-4 cluster_2.1.6 evaluate_0.23
## [100] cli_3.6.2 locfit_1.5-9.9 compiler_4.4.0
## [103] rlang_1.1.3 crayon_1.5.2 future.apply_1.11.2
## [106] ggsignif_0.6.4 labeling_0.4.3 plyr_1.8.9
## [109] stringi_1.8.4 deldir_2.0-4 BiocParallel_1.38.0
## [112] munsell_0.5.1 lazyeval_0.2.2 spatstat.geom_3.2-9
## [115] RcppHNSW_0.6.0 hms_1.1.3 future_1.33.2
## [118] shiny_1.8.1.1 highr_0.10 ROCR_1.0-11
## [121] igraph_2.0.3 broom_1.0.5 bslib_0.7.0